Libraries required for this analysis

# Chunk options and libraries used throughout the analysis.
knitr::opts_chunk$set(fig.align="center") 
library(rstanarm)
library(tidyverse)
library(tidybayes)   # add_predicted_draws, compare_levels, stat_eye/stat_halfeye
library(modelr) 
library(ggplot2)
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)        # all models below are fit with brm()
library(gganimate)

# Default ggplot theme for any plot that does not override it.
theme_set(theme_light())

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).

Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s average score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.

Read in and clean data

# Questionnaire metrics analyzed below, plus the two sub-groupings.
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")

# Read responses and coerce columns to the types the models expect:
# Likert metric columns become ordered factors (required by the
# cumulative-probit family), experimental conditions become factors.
# (A duplicated re-definition of `analyses` was removed here, and the
# base lapply()-assignment was folded into the single mutate() call.)
user_response_data <- read.csv('processed_ptask_responses.csv')
user_response_data <- user_response_data %>%
  mutate(
    across(all_of(analyses), ordered),
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )

# Containers filled in per metric: fitted models and posterior contrasts.
models <- list()

search_differences <- list()  # bfs vs dfs contrasts
oracle_differences <- list()  # dziban vs compassql contrasts
alg_differences <- list()     # combined search+oracle condition contrasts


# Seed shared by model fitting and posterior draws for reproducibility.
seed = 12

Analysis for user responses

Confidence in Understanding Data: Building a Model

# Cumulative (ordinal) probit model for confidence in understanding the
# data, with a participant-level random intercept. The fit is cached to
# disk via `file`, so re-running this chunk reloads the saved model.
filename = "confidence_udata"

models$confidence_udata <- brm(
    formula = bf(confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),  # high adapt_delta to reduce divergent transitions
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DBOOST_NO_AUTO_PTR  -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
##                ^
##                ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
##          ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1, Bulk_ESS in the thousands.
summary(models$confidence_udata)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.udata ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.04      0.17     0.74     1.44 1.00      868     1592
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -1.91      0.42    -2.71    -1.10 1.00     1639
## Intercept[2]              -0.62      0.39    -1.38     0.17 1.00     1643
## Intercept[3]               1.51      0.40     0.75     2.34 1.00     1564
## datasetmovies              0.07      0.32    -0.56     0.68 1.00     1605
## oracledziban               0.28      0.46    -0.61     1.19 1.00     1016
## searchdfs                 -0.28      0.45    -1.15     0.61 1.00     1301
## task2.RetrieveValue        0.40      0.22    -0.02     0.83 1.00     3761
## task3.Prediction           0.27      0.21    -0.13     0.69 1.00     3805
## task4.Exploration          0.75      0.22     0.30     1.17 1.00     3572
## oracledziban:searchdfs     0.49      0.65    -0.80     1.70 1.01      989
##                        Tail_ESS
## Intercept[1]               2086
## Intercept[2]               2052
## Intercept[3]               2134
## datasetmovies              1669
## oracledziban               1787
## searchdfs                  1367
## task2.RetrieveValue        2444
## task3.Prediction           2595
## task4.Exploration          2198
## oracledziban:searchdfs     1331
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots: chains should overlap and look stationary.
plot(models$confidence_udata)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the threshold (intercept) parameters.
# NOTE: this model has only THREE thresholds (see Intercept[1]-[3] in the
# summary above), so b_Intercept[4] -- present in the other models -- does
# not exist here and requesting it would make pairs() fail.
pairs(
  models$confidence_udata,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]"),
  fixed = TRUE
)

# Pairs plot of the population-level effects (interaction term omitted).
pairs(
  models$confidence_udata,
  pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at the average response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of the average rating per (search, oracle)
# condition; re_formula = NA marginalizes out participant random effects.
# summarize() collapses each (search, oracle, .draw) group to one row --
# the original mutate() repeated the same group mean on every observation
# row (~236x duplication) with no effect on the plot or mean_qi below.
# as.character() first recovers the rating labels of the ordered factor
# rather than its 1-based level indices.
draw_data_confidence_udata <- user_response_data %>%
  add_predicted_draws(models$confidence_udata,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  summarize(rating = weighted.mean(as.numeric(as.character(.prediction))))
  
# Eye plot (density + point intervals) of the average rating per oracle,
# faceted by search algorithm.
confidence_udata_plot <- draw_data_confidence_udata %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

confidence_udata_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric boundaries of the intervals shown above: posterior mean and
# 95%/50% quantile intervals of the average rating per condition.
fit_info_confidence_udata <- draw_data_confidence_udata %>%
  group_by(search, oracle) %>%
  mean_qi(rating, .width = c(.95, .5))

fit_info_confidence_udata
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.954  0.589  1.29    0.95 mean   qi       
## 2 bfs    dziban     1.09   0.766  1.42    0.95 mean   qi       
## 3 dfs    compassql  0.811  0.433  1.13    0.95 mean   qi       
## 4 dfs    dziban     1.19   0.867  1.5     0.95 mean   qi       
## 5 bfs    compassql  0.954  0.839  1.07    0.5  mean   qi       
## 6 bfs    dziban     1.09   0.983  1.2     0.5  mean   qi       
## 7 dfs    compassql  0.811  0.7    0.933   0.5  mean   qi       
## 8 dfs    dziban     1.19   1.08   1.3     0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Understanding Data: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (participant effects marginalized out) plus a
# combined "search oracle" condition label for joint contrasts.
confidence_udata_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA) 
confidence_udata_predictive_data$alg <- paste(confidence_udata_predictive_data$search, confidence_udata_predictive_data$oracle)

Differences in user score by search algorithm.

# Posterior distribution of the difference in mean rating between search
# algorithms (dfs - bfs).
# NOTE(review): as.numeric(.prediction) yields 1-based level indices, unlike
# the draw_data_* chunks which use as.numeric(as.character(...)). Pairwise
# differences are unaffected as long as the rating labels are consecutive
# integers -- confirm against the response coding.
search_differences$confidence_udata <- confidence_udata_predictive_data %>% 
  group_by(search, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_udata$metric = "confidence.udata"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
search_differences$confidence_udata %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$confidence_udata[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Posterior distribution of the difference in mean rating between oracles
# (dziban - compassql).
oracle_differences$confidence_udata <- confidence_udata_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_udata$metric = "confidence.udata"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
oracle_differences$confidence_udata %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$confidence_udata[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Posterior difference in mean rating between the two combined conditions
# of interest; all other pairwise contrasts are computed then discarded by
# the subset() below.
alg_differences$confidence_udata <- confidence_udata_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_udata <- subset(alg_differences$confidence_udata, alg == "dfs compassql - bfs dziban")
alg_differences$confidence_udata$metric = "confidence.udata"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
alg_differences$confidence_udata %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.udata")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$confidence_udata[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Confidence in Answer: Building a Model

# Cumulative probit model for confidence in answer; same specification as
# the confidence_udata model, cached to disk via `file`.
filename = "confidence_ans"
models$confidence_ans <- brm(
    formula = bf(confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1, Bulk_ESS in the thousands.
summary(models$confidence_ans)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.ans ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.56      0.14     0.27     0.84 1.00      893      915
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -3.09      0.39    -3.88    -2.35 1.00     1937
## Intercept[2]              -2.29      0.32    -2.94    -1.69 1.00     1592
## Intercept[3]              -1.42      0.29    -2.00    -0.87 1.00     1515
## Intercept[4]               0.28      0.28    -0.28     0.84 1.00     1783
## datasetmovies             -0.11      0.21    -0.52     0.31 1.00     2005
## oracledziban               0.41      0.31    -0.19     1.03 1.00     1149
## searchdfs                  0.15      0.31    -0.47     0.76 1.00     1104
## task2.RetrieveValue       -0.27      0.22    -0.69     0.15 1.00     2386
## task3.Prediction          -0.98      0.23    -1.43    -0.53 1.00     2104
## task4.Exploration         -0.63      0.22    -1.07    -0.21 1.00     2341
## oracledziban:searchdfs    -0.15      0.44    -1.01     0.71 1.00      891
##                        Tail_ESS
## Intercept[1]               2069
## Intercept[2]               1917
## Intercept[3]               1533
## Intercept[4]               2179
## datasetmovies              1900
## oracledziban               1570
## searchdfs                  1675
## task2.RetrieveValue        2147
## task3.Prediction           1996
## task4.Exploration          1954
## oracledziban:searchdfs     1110
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots: chains should overlap and look stationary.
plot(models$confidence_ans)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the four threshold parameters (this model has four; see
# Intercept[1]-[4] in the summary above).
pairs(
  models$confidence_ans,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Pairs plot of the population-level effects (interaction term omitted).
pairs(
  models$confidence_ans,
  pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at the average response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of the average rating per (search, oracle)
# condition; re_formula = NA marginalizes out participant random effects.
# summarize() collapses each (search, oracle, .draw) group to one row --
# the original mutate() repeated the same group mean on every observation
# row with no effect on the plot or mean_qi below.
draw_data_confidence_ans <- user_response_data %>%
  add_predicted_draws(models$confidence_ans,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  summarize(rating = weighted.mean(as.numeric(as.character(.prediction))))
  
# Eye plot (density + point intervals) of the average rating per oracle,
# faceted by search algorithm.
confidence_ans_plot <- draw_data_confidence_ans %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

confidence_ans_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric boundaries of the intervals shown above: posterior mean and
# 95%/50% quantile intervals of the average rating per condition.
fit_info_confidence_ans <- draw_data_confidence_ans %>%
  group_by(search, oracle) %>%
  mean_qi(rating, .width = c(.95, .5))

fit_info_confidence_ans
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.958  0.589   1.29   0.95 mean   qi       
## 2 bfs    dziban     1.21   0.917   1.48   0.95 mean   qi       
## 3 dfs    compassql  1.05   0.7     1.35   0.95 mean   qi       
## 4 dfs    dziban     1.21   0.9     1.5    0.95 mean   qi       
## 5 bfs    compassql  0.958  0.839   1.07   0.5  mean   qi       
## 6 bfs    dziban     1.21   1.12    1.32   0.5  mean   qi       
## 7 dfs    compassql  1.05   0.933   1.17   0.5  mean   qi       
## 8 dfs    dziban     1.21   1.12    1.32   0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Answer: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (participant effects marginalized out) plus a
# combined "search oracle" condition label for joint contrasts.
confidence_ans_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA) 
confidence_ans_predictive_data$alg <- paste(confidence_ans_predictive_data$search, confidence_ans_predictive_data$oracle)

Differences in user score by search algorithm.

# Posterior distribution of the difference in mean rating between search
# algorithms (dfs - bfs).
search_differences$confidence_ans <- confidence_ans_predictive_data %>% 
  group_by(search, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_ans$metric = "confidence.ans"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
search_differences$confidence_ans %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$confidence_ans[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Posterior distribution of the difference in mean rating between oracles
# (dziban - compassql).
oracle_differences$confidence_ans <- confidence_ans_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_ans$metric = "confidence.ans"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
oracle_differences$confidence_ans %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$confidence_ans[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Posterior difference in mean rating between the two combined conditions
# of interest (dfs compassql vs bfs dziban).
alg_differences$confidence_ans <- confidence_ans_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_ans <- subset(alg_differences$confidence_ans, alg == "dfs compassql - bfs dziban")
alg_differences$confidence_ans$metric = "confidence.ans"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
alg_differences$confidence_ans %>%
      ggplot(aes(x = diff_in_rating, y = "confidence.ans")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$confidence_ans[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Efficiency: Building a Model

# Cumulative probit model for efficiency; same specification as above,
# cached to disk via `file`.
filename = "efficiency"
models$efficiency <- brm(
    formula = bf(efficiency ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1, Bulk_ESS in the thousands.
summary(models$efficiency)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: efficiency ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.06      0.16     0.77     1.41 1.00      881     1518
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.50      0.43    -3.38    -1.69 1.00      809
## Intercept[2]              -0.94      0.40    -1.74    -0.17 1.00      814
## Intercept[3]              -0.11      0.40    -0.89     0.66 1.00      816
## Intercept[4]               1.12      0.40     0.32     1.90 1.00      874
## datasetmovies              0.19      0.31    -0.47     0.81 1.00      762
## oracledziban               0.07      0.46    -0.86     0.97 1.00      673
## searchdfs                 -1.00      0.46    -1.94    -0.11 1.00      640
## task2.RetrieveValue       -0.28      0.20    -0.69     0.11 1.00     2389
## task3.Prediction           0.19      0.20    -0.21     0.58 1.00     2177
## task4.Exploration          0.35      0.21    -0.04     0.76 1.00     2171
## oracledziban:searchdfs     0.60      0.63    -0.65     1.87 1.00      655
##                        Tail_ESS
## Intercept[1]               1318
## Intercept[2]               1596
## Intercept[3]               1578
## Intercept[4]               1628
## datasetmovies               939
## oracledziban               1136
## searchdfs                  1245
## task2.RetrieveValue        2108
## task3.Prediction           2170
## task4.Exploration          2106
## oracledziban:searchdfs     1047
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots: chains should overlap and look stationary.
plot(models$efficiency)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the four threshold parameters (see Intercept[1]-[4] above).
pairs(
  models$efficiency,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Pairs plot of the population-level effects (interaction term omitted).
pairs(
  models$efficiency,
   pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at the average response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of the average rating per (search, oracle)
# condition; re_formula = NA marginalizes out participant random effects.
# summarize() collapses each (search, oracle, .draw) group to one row --
# the original mutate() repeated the same group mean on every observation
# row with no effect on the plot or mean_qi below.
draw_data_efficiency <- user_response_data %>%
  add_predicted_draws(models$efficiency,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  summarize(rating = weighted.mean(as.numeric(as.character(.prediction))))
  
# Eye plot (density + point intervals) of the average rating per oracle,
# faceted by search algorithm.
efficiency_plot <- draw_data_efficiency %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

efficiency_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric boundaries of the intervals shown above: posterior mean and
# 95%/50% quantile intervals of the average rating per condition.
fit_info_efficiency <- draw_data_efficiency %>%
  group_by(search, oracle) %>%
  mean_qi(rating, .width = c(.95, .5))

fit_info_efficiency
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower  .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.622  0.0357  1.23     0.95 mean   qi       
## 2 bfs    dziban     0.686  0.1     1.22     0.95 mean   qi       
## 3 dfs    compassql -0.234 -0.783   0.333    0.95 mean   qi       
## 4 dfs    dziban     0.340 -0.217   0.917    0.95 mean   qi       
## 5 bfs    compassql  0.622  0.411   0.839    0.5  mean   qi       
## 6 bfs    dziban     0.686  0.5     0.883    0.5  mean   qi       
## 7 dfs    compassql -0.234 -0.433  -0.0333   0.5  mean   qi       
## 8 dfs    dziban     0.340  0.133   0.55     0.5  mean   qi
## Saving 7 x 5 in image

Efficiency: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (participant effects marginalized out) plus a
# combined "search oracle" condition label for joint contrasts.
efficiency_predictive_data <- user_response_data %>% add_predicted_draws(models$efficiency, seed = seed, re_formula = NA) 
efficiency_predictive_data$alg <- paste(efficiency_predictive_data$search, efficiency_predictive_data$oracle)

Differences in user score by search algorithm.

# Posterior distribution of the difference in mean rating between search
# algorithms (dfs - bfs).
search_differences$efficiency <- efficiency_predictive_data %>% 
  group_by(search, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$efficiency$metric = "efficiency"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
search_differences$efficiency %>%
      ggplot(aes(x = diff_in_rating, y = "efficiency")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$efficiency[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Posterior distribution of the difference in mean rating between oracles
# (dziban - compassql).
oracle_differences$efficiency <- efficiency_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$efficiency$metric = "efficiency"

# Half-eye plot of the difference; the dashed line marks zero (no effect).
oracle_differences$efficiency %>%
      ggplot(aes(x = diff_in_rating, y = "efficiency")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$efficiency[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Posterior difference in mean rating between the two combined conditions
# of interest (dfs compassql vs bfs dziban).
alg_differences$efficiency <- efficiency_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$efficiency <- subset(alg_differences$efficiency, alg == "dfs compassql - bfs dziban")
alg_differences$efficiency$metric = "efficiency"


# Half-eye plot of the difference; the dashed line marks zero (no effect).
alg_differences$efficiency %>%
      ggplot(aes(x = diff_in_rating, y = "efficiency")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$efficiency[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Ease of Use: Building a Model

# Cumulative probit model for ease of use; same specification as above,
# cached to disk via `file`.
filename = "ease_of_use"
models$ease_of_use <- brm(
    formula = bf(ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1, Bulk_ESS in the thousands.
summary(models$ease_of_use)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: ease.of.use ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.03      0.16     0.75     1.37 1.00      957     1411
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.76      0.44    -3.63    -1.93 1.00     1453
## Intercept[2]              -1.25      0.39    -2.00    -0.51 1.00     1410
## Intercept[3]              -0.38      0.38    -1.13     0.36 1.00     1401
## Intercept[4]               1.49      0.40     0.74     2.28 1.00     1378
## datasetmovies              0.39      0.32    -0.23     1.01 1.00     1321
## oracledziban              -0.18      0.44    -1.05     0.67 1.00     1231
## searchdfs                 -1.19      0.43    -2.05    -0.36 1.00     1125
## task2.RetrieveValue        0.29      0.21    -0.12     0.71 1.00     3396
## task3.Prediction           0.45      0.21     0.05     0.86 1.00     3060
## task4.Exploration          0.50      0.21     0.10     0.90 1.00     3108
## oracledziban:searchdfs     0.70      0.61    -0.51     1.94 1.00     1181
##                        Tail_ESS
## Intercept[1]               1714
## Intercept[2]               1652
## Intercept[3]               1611
## Intercept[4]               1487
## datasetmovies              1557
## oracledziban               1632
## searchdfs                  1697
## task2.RetrieveValue        2232
## task3.Prediction           2398
## task4.Exploration          2263
## oracledziban:searchdfs     1690
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots per parameter; well-mixed, overlapping chains indicate no
# obvious evidence of non-convergence.
plot(models$ease_of_use)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior scatterplots of the four ordinal cutpoints; strong
# correlations here would mean the model has trouble separating them.
pairs(
  models$ease_of_use,
  fixed = TRUE,
  pars = sprintf("b_Intercept[%d]", 1:4)
)

# Pairwise posterior scatterplots of the population-level effects.
pairs(
  models$ease_of_use,
  fixed = TRUE,
  pars = c(
    "b_datasetmovies", "b_oracledziban", "b_searchdfs",
    "b_task2.RetrieveValue", "b_task3.Prediction", "b_task4.Exploration"
  )
)

We now look at an average response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of ease-of-use, averaged per draw within each
# search x oracle cell. re_formula = NA excludes the participant-level
# random effects, so draws describe an average participant.
draw_data_ease_of_use <- user_response_data %>%
  add_predicted_draws(models$ease_of_use,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  # as.character() first converts the ordinal *labels* (not the factor
  # codes) to numbers; weighted.mean() with no weights is just a mean.
  # mutate() attaches the same per-group mean to every row of the group.
  mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))
  
# Eye plots of the per-draw mean rating by oracle, faceted by search.
ease_of_use_plot <- draw_data_ease_of_use %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

ease_of_use_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% / 50% interval bounds for each search x oracle cell.
fit_info_ease_of_use <- draw_data_ease_of_use %>% group_by(search, oracle) %>% mean_qi(rating, .width = c(.95, .5))
fit_info_ease_of_use
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql 0.918   0.482   1.30    0.95 mean   qi       
## 2 bfs    dziban    0.815   0.35    1.2     0.95 mean   qi       
## 3 dfs    compassql 0.0884 -0.417   0.567   0.95 mean   qi       
## 4 dfs    dziban    0.478  -0.05    0.95    0.95 mean   qi       
## 5 bfs    compassql 0.918   0.786   1.05    0.5  mean   qi       
## 6 bfs    dziban    0.815   0.683   0.967   0.5  mean   qi       
## 7 dfs    compassql 0.0884 -0.0833  0.267   0.5  mean   qi       
## 8 dfs    dziban    0.478   0.317   0.65    0.5  mean   qi
## Saving 7 x 5 in image

Ease of Use: Differences Between Conditions

Next, we want to see if there is any meaningful difference in ease-of-use rating between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Predictive draws for the difference analyses; `alg` concatenates the
# search and oracle labels into a single condition string.
ease_of_use_predictive_data <- user_response_data %>% add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA) 
ease_of_use_predictive_data$alg <- paste(ease_of_use_predictive_data$search, ease_of_use_predictive_data$oracle)

Differences in user score by search algorithm.

# Per-draw mean rating by search algorithm, then the draw-wise difference
# between the two search levels.
search_differences$ease_of_use <- ease_of_use_predictive_data %>% 
  group_by(search, .draw) %>%
   # NOTE(review): as.numeric(.prediction) averages the factor codes rather
   # than the labels; the constant offset cancels in compare_levels(), but
   # as.numeric(as.character(.prediction)) would match the plots above.
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$ease_of_use$metric = "ease.of.use"

# Posterior distribution of the difference; the dashed line marks zero.
search_differences$ease_of_use %>%
      ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$ease_of_use[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Per-draw mean rating by oracle, then the draw-wise difference between
# the two oracle levels.
oracle_differences$ease_of_use <- ease_of_use_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$ease_of_use$metric = "ease.of.use"

# Posterior distribution of the difference; the dashed line marks zero.
oracle_differences$ease_of_use %>%
      ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$ease_of_use[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Per-draw mean rating by search+oracle combination, then draw-wise
# differences between all combinations.
alg_differences$ease_of_use <- ease_of_use_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
# Keep only the cross-condition contrast of interest.
alg_differences$ease_of_use <- subset(alg_differences$ease_of_use, alg == "dfs compassql - bfs dziban")
alg_differences$ease_of_use$metric = "ease.of.use"

# Posterior distribution of the difference; the dashed line marks zero.
alg_differences$ease_of_use %>%
      ggplot(aes(x = diff_in_rating, y = "ease.of.use")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$ease_of_use[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Utility: Building a Model

# Ordinal (cumulative probit) model of the utility rating with a
# participant-level random intercept; same specification as the other
# rating models in this document.
filename = "utility"
models$utility <- brm(
    formula = bf(utility ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Prior on the latent-scale cutpoints (class = Intercept); the mean/sd
    # presumably come from prior elicitation -- TODO confirm.
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    # High adapt_delta to avoid divergent transitions.
    control = list(adapt_delta = 0.99),
    # Cache the fit to "utility.rds"; re-knitting reuses it.
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1.00 and Bulk_ESS ideally in the
# thousands for every parameter.
summary(models$utility)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: utility ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.93      0.15     0.67     1.25 1.00      692     1577
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -1.75      0.37    -2.52    -1.02 1.00      678
## Intercept[2]              -0.51      0.36    -1.22     0.22 1.00      732
## Intercept[3]               0.14      0.36    -0.58     0.84 1.00      708
## Intercept[4]               1.38      0.37     0.68     2.14 1.00      762
## datasetmovies              0.27      0.28    -0.28     0.84 1.00      917
## oracledziban               0.19      0.42    -0.65     1.02 1.00      778
## searchdfs                 -0.60      0.40    -1.35     0.23 1.00      761
## task2.RetrieveValue       -0.21      0.20    -0.62     0.18 1.00     1717
## task3.Prediction           0.32      0.20    -0.07     0.71 1.00     1781
## task4.Exploration          0.58      0.21     0.17     0.97 1.00     1777
## oracledziban:searchdfs     0.27      0.59    -0.89     1.40 1.00      683
##                        Tail_ESS
## Intercept[1]                925
## Intercept[2]               1371
## Intercept[3]               1381
## Intercept[4]               1285
## datasetmovies              1292
## oracledziban               1106
## searchdfs                  1214
## task2.RetrieveValue        2148
## task3.Prediction           2217
## task4.Exploration          2030
## oracledziban:searchdfs      995
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots per parameter; well-mixed, overlapping chains indicate no
# obvious evidence of non-convergence.
plot(models$utility)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior scatterplots: first the four ordinal cutpoints, then
# the population-level effects. Strong correlations would indicate the
# model cannot separate those parameters.
pairs(
  models$utility,
  fixed = TRUE,
  pars = sprintf("b_Intercept[%d]", 1:4)
)

pairs(
  models$utility,
  fixed = TRUE,
  pars = c(
    "b_datasetmovies", "b_oracledziban", "b_searchdfs",
    "b_task2.RetrieveValue", "b_task3.Prediction", "b_task4.Exploration"
  )
)

We now look at an average response for utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of utility, averaged per draw within each
# search x oracle cell (re_formula = NA excludes the participant-level
# random effects).
draw_data_utility <- user_response_data %>%
  add_predicted_draws(models$utility,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  # Average the ordinal labels (as.character() first, not factor codes);
  # weighted.mean() with no weights is a plain mean.
  mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))

# Eye plots of the per-draw mean rating by oracle, faceted by search.
utility_plot <- draw_data_utility %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

utility_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% / 50% interval bounds for each search x oracle cell.
fit_info_utility <- draw_data_utility %>% group_by(search, oracle) %>% mean_qi(rating, .width = c(.95, .5))
fit_info_utility
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle     rating .lower .upper .width .point .interval
##   <fct>  <fct>       <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.464  -0.143  1.02    0.95 mean   qi       
## 2 bfs    dziban     0.641   0.05   1.2     0.95 mean   qi       
## 3 dfs    compassql -0.0936 -0.667  0.483   0.95 mean   qi       
## 4 dfs    dziban     0.345  -0.25   0.917   0.95 mean   qi       
## 5 bfs    compassql  0.464   0.268  0.661   0.5  mean   qi       
## 6 bfs    dziban     0.641   0.45   0.833   0.5  mean   qi       
## 7 dfs    compassql -0.0936 -0.3    0.1     0.5  mean   qi       
## 8 dfs    dziban     0.345   0.15   0.55    0.5  mean   qi
## Saving 7 x 5 in image

Utility: Differences Between Conditions

Next, we want to see if there is any meaningful difference in utility rating between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Predictive draws for the difference analyses; `alg` concatenates the
# search and oracle labels into a single condition string.
utility_predictive_data <- user_response_data %>% add_predicted_draws(models$utility, seed = seed, re_formula = NA) 
utility_predictive_data$alg <- paste(utility_predictive_data$search, utility_predictive_data$oracle)

Differences in user score by search algorithm.

# Per-draw mean rating by search algorithm, then the draw-wise difference
# between the two search levels.
search_differences$utility <- utility_predictive_data %>% 
  group_by(search, .draw) %>%
   # NOTE(review): as.numeric(.prediction) averages factor codes; the
   # constant offset cancels in compare_levels().
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$utility$metric = "utility"

# Posterior distribution of the difference; the dashed line marks zero.
search_differences$utility %>%
      ggplot(aes(x = diff_in_rating, y = "utility")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$utility[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Per-draw mean rating by oracle, then the draw-wise difference between
# the two oracle levels.
oracle_differences$utility <- utility_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$utility$metric = "utility"

# Posterior distribution of the difference; the dashed line marks zero.
oracle_differences$utility %>%
      ggplot(aes(x = diff_in_rating, y = "utility")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$utility[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Per-draw mean rating by search+oracle combination, then draw-wise
# differences between all combinations.
alg_differences$utility <- utility_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
# Keep only the cross-condition contrast of interest.
alg_differences$utility <- subset(alg_differences$utility, alg == "dfs compassql - bfs dziban")
alg_differences$utility$metric = "utility"

# Posterior distribution of the difference; the dashed line marks zero.
alg_differences$utility %>%
      ggplot(aes(x = diff_in_rating, y = "utility")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$utility[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Overall: Building a Model

# Ordinal (cumulative probit) model of the overall rating with a
# participant-level random intercept; same specification as the other
# rating models in this document.
filename = "overall"
models$overall <- brm(
    formula = bf(overall ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Prior on the latent-scale cutpoints (class = Intercept); the mean/sd
    # presumably come from prior elicitation -- TODO confirm.
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    # High adapt_delta to avoid divergent transitions.
    control = list(adapt_delta = 0.99),
    # Cache the fit to "overall.rds"; re-knitting reuses it.
    file = filename,
    seed = seed
  )
## Compiling Stan program...
## Start sampling

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1.00 and Bulk_ESS ideally in the
# thousands for every parameter.
summary(models$overall)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: overall ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 236) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 59) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.40      0.20     1.05     1.81 1.01      898     1147
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -3.05      0.54    -4.17    -1.98 1.00     1132
## Intercept[2]              -1.68      0.49    -2.68    -0.71 1.00     1053
## Intercept[3]              -0.39      0.47    -1.32     0.54 1.00     1022
## Intercept[4]               1.69      0.49     0.74     2.67 1.00     1025
## datasetmovies             -0.17      0.41    -0.98     0.62 1.00     1032
## oracledziban               0.10      0.57    -0.99     1.23 1.00      842
## searchdfs                 -0.71      0.57    -1.82     0.41 1.00      885
## task2.RetrieveValue       -0.03      0.21    -0.43     0.37 1.00     3706
## task3.Prediction           0.38      0.21    -0.04     0.79 1.00     3639
## task4.Exploration          0.66      0.22     0.24     1.10 1.00     3240
## oracledziban:searchdfs     0.38      0.81    -1.25     1.95 1.00      867
##                        Tail_ESS
## Intercept[1]               1454
## Intercept[2]               1394
## Intercept[3]               1398
## Intercept[4]               1523
## datasetmovies              1507
## oracledziban               1557
## searchdfs                  1292
## task2.RetrieveValue        2394
## task3.Prediction           2516
## task4.Exploration          2627
## oracledziban:searchdfs     1228
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# Trace plots per parameter; well-mixed, overlapping chains indicate no
# obvious evidence of non-convergence.
plot(models$overall)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior scatterplots: first the four ordinal cutpoints, then
# the population-level effects. Strong correlations would indicate the
# model cannot separate those parameters.
pairs(
  models$overall,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

pairs(
  models$overall,
   pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at an average response for overall rating using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior predictive draws of overall rating, averaged per draw within
# each search x oracle cell (re_formula = NA excludes the participant-level
# random effects).
draw_data_overall <- user_response_data %>%
  add_predicted_draws(models$overall,
                   seed = seed,
                   re_formula = NA) %>%
  group_by(search, oracle, .draw) %>%
  # Average the ordinal labels (as.character() first, not factor codes);
  # weighted.mean() with no weights is a plain mean.
  mutate(rating = weighted.mean(as.numeric(as.character(.prediction))))

# Eye plots of the per-draw mean rating by oracle, faceted by search.
overall_plot <- draw_data_overall %>%
  ggplot(aes(x = oracle, y = rating)) +
  stat_eye(.width = c(.95, .5)) +
  theme_minimal() +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_grid(. ~ search)

overall_plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% / 50% interval bounds for each search x oracle cell.
fit_info_overall <- draw_data_overall %>% group_by(search, oracle) %>% mean_qi(rating, .width = c(.95, .5))
fit_info_overall
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.723  0.196  1.16    0.95 mean   qi       
## 2 bfs    dziban     0.774  0.283  1.2     0.95 mean   qi       
## 3 dfs    compassql  0.294 -0.267  0.783   0.95 mean   qi       
## 4 dfs    dziban     0.584  0.05   1.03    0.95 mean   qi       
## 5 bfs    compassql  0.723  0.571  0.893   0.5  mean   qi       
## 6 bfs    dziban     0.774  0.633  0.933   0.5  mean   qi       
## 7 dfs    compassql  0.294  0.117  0.483   0.5  mean   qi       
## 8 dfs    dziban     0.584  0.433  0.767   0.5  mean   qi
## Saving 7 x 5 in image

Overall: Differences Between Conditions

Next, we want to see if there is any meaningful difference in overall rating between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Predictive draws for the difference analyses; `alg` concatenates the
# search and oracle labels into a single condition string.
overall_predictive_data <- user_response_data %>% add_predicted_draws(models$overall, seed = seed, re_formula = NA) 
overall_predictive_data$alg <- paste(overall_predictive_data$search, overall_predictive_data$oracle)

Differences in user score by search algorithm.

# Per-draw mean rating by search algorithm, then the draw-wise difference
# between the two search levels.
search_differences$overall <- overall_predictive_data %>% 
  group_by(search, .draw) %>%
   # NOTE(review): as.numeric(.prediction) averages factor codes; the
   # constant offset cancels in compare_levels().
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = search) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$overall$metric = "overall"

# Posterior distribution of the difference; the dashed line marks zero.
search_differences$overall %>%
      ggplot(aes(x = diff_in_rating, y = "overall")) +
      xlab(paste0("Expected Difference in Rating (",search_differences$overall[1,'search'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by oracle.

# Per-draw mean rating by oracle, then the draw-wise difference between
# the two oracle levels.
oracle_differences$overall <- overall_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$metric = "overall"

# Posterior distribution of the difference; the dashed line marks zero.
oracle_differences$overall %>%
      ggplot(aes(x = diff_in_rating, y = "overall")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$overall[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Per-draw mean rating by search+oracle combination, then draw-wise
# differences between all combinations.
alg_differences$overall <- overall_predictive_data %>% 
  group_by(alg, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = alg) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
# Keep only the cross-condition contrast of interest.
alg_differences$overall <- subset(alg_differences$overall, alg == "dfs compassql - bfs dziban")
alg_differences$overall$metric = "overall"

# Posterior distribution of the difference; the dashed line marks zero.
alg_differences$overall %>%
      ggplot(aes(x = diff_in_rating, y = "overall")) +
      xlab(paste0("Expected Difference in Rating (",alg_differences$overall[1,'alg'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

Summary Plots

Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.

Differences in Search Algorithms

# Stack the per-metric search-difference draws into one data frame; metric
# becomes a factor in reversed analysis order so plots list it top-down.
combined_search_differences <- rbind(search_differences$confidence_udata, search_differences$confidence_ans, search_differences$efficiency, search_differences$ease_of_use, search_differences$utility, search_differences$overall)

combined_search_differences$metric <- factor(combined_search_differences$metric, levels=rev(analyses))

# flip order so that we get bfs - dfs
if(combined_search_differences[1,'search']=="dfs - bfs"){
  combined_search_differences$search = 'bfs - dfs'
  combined_search_differences$diff_in_rating = -1 * combined_search_differences$diff_in_rating
}

# Confidence-metric differences only, one half-eye per metric.
combined_search_differences_confidence <- subset(combined_search_differences, metric %in% confidence_metrics)
search_differences_plot_confidence <- combined_search_differences_confidence %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_search_differences_confidence[1,'search'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

search_differences_plot_confidence

View intervals

# Interval bounds for the confidence-metric search differences.
fit_info_search_differences_confidence <- combined_search_differences_confidence %>% group_by(search, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_search_differences_confidence
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    metric         diff_in_rating  .lower .upper .width .point .interval
##   <chr>     <fct>                   <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs - dfs confidence.ans        -0.0398 -0.352  0.268    0.95 mean   qi       
## 2 bfs - dfs confidence.ud…         0.0261 -0.312  0.370    0.95 mean   qi       
## 3 bfs - dfs confidence.ans        -0.0398 -0.147  0.0649   0.5  mean   qi       
## 4 bfs - dfs confidence.ud…         0.0261 -0.0917 0.145    0.5  mean   qi
# Preference-metric search differences, one half-eye per metric.
combined_search_differences_preference <- subset(combined_search_differences, metric %in% preference_metrics)
search_differences_plot_preference <- combined_search_differences_preference %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      # Fix: this panel shows the preference metrics, not the confidence
      # metrics — label was copy-pasted from the confidence plot above.
      ylab("Preference") +
      xlab(paste0("Expected Difference in Rating (",combined_search_differences_preference[1,'search'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()
search_differences_plot_preference

View intervals

# Interval bounds for the preference-metric search differences.
fit_info_search_differences_preference <- combined_search_differences_preference %>% group_by(search, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_search_differences_preference
## # A tibble: 8 x 8
## # Groups:   search [1]
##   search    metric      diff_in_rating  .lower .upper .width .point .interval
##   <chr>     <fct>                <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs - dfs overall              0.310 -0.185   0.814   0.95 mean   qi       
## 2 bfs - dfs utility              0.430 -0.158   1.00    0.95 mean   qi       
## 3 bfs - dfs ease.of.use          0.581  0.114   1.06    0.95 mean   qi       
## 4 bfs - dfs efficiency           0.602  0.0224  1.19    0.95 mean   qi       
## 5 bfs - dfs overall              0.310  0.152   0.470   0.5  mean   qi       
## 6 bfs - dfs utility              0.430  0.239   0.631   0.5  mean   qi       
## 7 bfs - dfs ease.of.use          0.581  0.423   0.739   0.5  mean   qi       
## 8 bfs - dfs efficiency           0.602  0.396   0.805   0.5  mean   qi

Differences in Oracle

# Stack the per-metric oracle-difference draws; no sign flip needed here
# since compare_levels already yields the "dziban - compassql" direction.
combined_oracle_differences <- rbind(oracle_differences$confidence_udata, oracle_differences$confidence_ans, oracle_differences$efficiency, oracle_differences$ease_of_use, oracle_differences$utility, oracle_differences$overall)

combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels=rev(analyses))

# Confidence-metric differences only, one half-eye per metric.
combined_oracle_differences_confidence <- subset(combined_oracle_differences, metric %in% confidence_metrics)
oracle_differences_plot_confidence <- combined_oracle_differences_confidence %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_confidence[1,'oracle'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

oracle_differences_plot_confidence

View intervals

# Interval bounds for the confidence-metric oracle differences.
fit_info_oracle_differences_confidence <- combined_oracle_differences_confidence %>% group_by(oracle, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_oracle_differences_confidence
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle       metric      diff_in_rating  .lower .upper .width .point .interval
##   <chr>        <fct>                <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - co… confidence…          0.207 -0.105   0.527   0.95 mean   qi       
## 2 dziban - co… confidence…          0.261 -0.0652  0.604   0.95 mean   qi       
## 3 dziban - co… confidence…          0.207  0.105   0.312   0.5  mean   qi       
## 4 dziban - co… confidence…          0.261  0.143   0.378   0.5  mean   qi
# Preference-metric oracle differences, one half-eye per metric.
combined_oracle_differences_preference <- subset(combined_oracle_differences, metric %in% preference_metrics)
oracle_differences_plot_preference <- combined_oracle_differences_preference %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      # Fix: this panel shows the preference metrics, not the confidence
      # metrics — label was copy-pasted from the confidence plot above.
      ylab("Preference") +
      xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_preference[1,'oracle'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()
oracle_differences_plot_preference

View intervals

# Interval bounds for the preference-metric oracle differences.
fit_info_oracle_differences_preference <- combined_oracle_differences_preference %>% group_by(oracle, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_oracle_differences_preference
## # A tibble: 8 x 8
## # Groups:   oracle [1]
##   oracle         metric   diff_in_rating   .lower .upper .width .point .interval
##   <chr>          <fct>             <dbl>    <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - comp… overall           0.178 -0.295    0.670   0.95 mean   qi       
## 2 dziban - comp… utility           0.317 -0.266    0.871   0.95 mean   qi       
## 3 dziban - comp… ease.of…          0.158 -0.297    0.618   0.95 mean   qi       
## 4 dziban - comp… efficie…          0.334 -0.237    0.923   0.95 mean   qi       
## 5 dziban - comp… overall           0.178  0.0138   0.338   0.5  mean   qi       
## 6 dziban - comp… utility           0.317  0.122    0.512   0.5  mean   qi       
## 7 dziban - comp… ease.of…          0.158 -0.00158  0.318   0.5  mean   qi       
## 8 dziban - comp… efficie…          0.334  0.142    0.527   0.5  mean   qi

dfs compassql vs bfs dziban

# Stack the per-metric draws for the dfs+compassql vs bfs+dziban contrast.
combined_alg_differences <- rbind(alg_differences$confidence_udata, alg_differences$confidence_ans, alg_differences$efficiency, alg_differences$ease_of_use, alg_differences$utility, alg_differences$overall)
combined_alg_differences$metric <- factor(combined_alg_differences$metric, levels=rev(analyses))


# flip order so that we get bfs - dfs
# NOTE(review): alg values look like "dfs compassql - bfs dziban", so this
# condition appears never to be TRUE -- likely copied from the search
# block; confirm the intended contrast direction.
if(combined_alg_differences[1,'alg']=="dfs - bfs"){
  combined_alg_differences$alg = 'bfs - dfs'
  combined_alg_differences$diff_in_rating = -1 * combined_alg_differences$diff_in_rating
}

# Confidence-metric differences only, one half-eye per metric.
combined_alg_differences_confidence <- subset(combined_alg_differences, metric %in% confidence_metrics)
alg_differences_plot_confidence <- combined_alg_differences_confidence %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_alg_differences_confidence[1,'alg'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

alg_differences_plot_confidence

View intervals

# Interval bounds for the confidence-metric algorithm-contrast differences.
fit_info_alg_differences_confidence <- combined_alg_differences_confidence %>% group_by(alg, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_alg_differences_confidence
## # A tibble: 4 x 8
## # Groups:   alg [1]
##   alg            metric    diff_in_rating .lower  .upper .width .point .interval
##   <chr>          <fct>              <dbl>  <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 dfs compassql… confiden…         -0.165 -0.6    0.283    0.95 mean   qi       
## 2 dfs compassql… confiden…         -0.282 -0.783  0.217    0.95 mean   qi       
## 3 dfs compassql… confiden…         -0.165 -0.3   -0.0167   0.5  mean   qi       
## 4 dfs compassql… confiden…         -0.282 -0.450 -0.117    0.5  mean   qi
# Preference-metric algorithm-contrast differences, one half-eye per metric.
combined_alg_differences_preference <- subset(combined_alg_differences, metric %in% preference_metrics)
alg_differences_plot_preference <- combined_alg_differences_preference %>%
      ggplot(aes(x = diff_in_rating, y = metric)) +
      # Fix: this panel shows the preference metrics, not the confidence
      # metrics — label was copy-pasted from the confidence plot above.
      ylab("Preference") +
      xlab(paste0("Expected Difference in Rating (",combined_alg_differences_preference[1,'alg'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()
alg_differences_plot_preference

View intervals

# Interval bounds for the preference-metric algorithm-contrast differences.
fit_info_alg_differences_preference <- combined_alg_differences_preference %>% group_by(alg, metric) %>% mean_qi(diff_in_rating, .width = c(.95, .5))
fit_info_alg_differences_preference
## # A tibble: 8 x 8
## # Groups:   alg [1]
##   alg             metric   diff_in_rating .lower  .upper .width .point .interval
##   <chr>           <fct>             <dbl>  <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 dfs compassql … overall          -0.480 -1.17   0.2      0.95 mean   qi       
## 2 dfs compassql … utility          -0.734 -1.53   0.0833   0.95 mean   qi       
## 3 dfs compassql … ease.of…         -0.727 -1.38  -0.0500   0.95 mean   qi       
## 4 dfs compassql … efficie…         -0.920 -1.70  -0.117    0.95 mean   qi       
## 5 dfs compassql … overall          -0.480 -0.717 -0.233    0.5  mean   qi       
## 6 dfs compassql … utility          -0.734 -1     -0.467    0.5  mean   qi       
## 7 dfs compassql … ease.of…         -0.727 -0.95  -0.5      0.5  mean   qi       
## 8 dfs compassql … efficie…         -0.920 -1.2   -0.650    0.5  mean   qi